Load libraries

knitr::opts_chunk$set(echo = TRUE)

list.of.packages <- c("gsheet", "tidyverse", "janitor", "plotly", "glmmTMB", "metafor", "broom.mixed") #add new libraries here 

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})
sessionInfo()

Prepare data

NOTE: data is read directly from the GoogleSheet using a share link that was set to “anyone with a link can view”

# Read in data from GoogleSheet 
data.fert <- as_tibble(gsheet2tbl('https://docs.google.com/spreadsheets/d/111SuH548Et6HDckjbQjtYk-B8A5VfNkUYZgcUNRPxxY/edit?usp=sharing'))
## Warning: Missing column names filled in: 'X22' [22]
#replace NR (not reported) with NA and convert columns to factor / numeric where needed 
data.fert <- data.fert %>% 
  na_if("NR") %>%     
  mutate_at(c('Phylum', 'Study', 'Taxonomic Group', 'Common name', 'Latin name', 'Error statistic'), as.factor) %>%
  mutate_at(c('pH Experimental', 'pH Control', 'pCO2 Experimental', 'pCO2 Control', 'Ave. Fert. % @ pH', 'Error % @ pH', '# Trials @ pH', 'Insemination time'), as.numeric) %>%
  clean_names()  # fill in spaces with underscores for column names 
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
str(data.fert)   
## Classes 'tbl_df', 'tbl' and 'data.frame':    334 obs. of  22 variables:
##  $ phylum                  : Factor w/ 4 levels "Cnidaria","Crustacea",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ paper_link              : chr  "https://doi.org/10.1016/j.jembe.2012.12.014" "https://doi.org/10.1016/j.jembe.2012.12.014" "https://doi.org/10.1016/j.jembe.2012.12.014" "https://doi.org/10.1080/15287394.2011.550460" ...
##  $ study                   : Factor w/ 37 levels "Albright, 2011",..: 3 3 3 4 4 5 5 6 6 6 ...
##  $ taxonomic_group         : Factor w/ 10 levels "abalone","clam",..: 8 8 8 5 5 8 8 2 2 2 ...
##  $ common_name             : Factor w/ 30 levels "amphipod","Antarctic bivalve",..: 21 21 21 7 7 11 11 2 2 2 ...
##  $ latin_name              : Factor w/ 41 levels "Acartia erythraea",..: 12 12 12 29 29 13 13 23 23 23 ...
##  $ p_h_experimental        : num  7.37 7.76 8.09 7.6 8.1 7.5 7.9 7.63 7.63 7.63 ...
##  $ p_h_control             : num  8.09 8.09 8.09 8.1 8.1 7.9 7.9 7.97 7.97 7.97 ...
##  $ p_co2_experimental      : num  3573 1386 580 1440 444 ...
##  $ p_co2_control           : num  580 580 580 444 444 ...
##  $ ave_fert_percent_p_h    : num  41 80 90 98 99 74 80 51.5 66.4 73.6 ...
##  $ error_percent_p_h       : num  NA NA NA NA NA 4 5 1 3.1 2.7 ...
##  $ error_statistic         : Factor w/ 5 levels "95% CI","MAD",..: NA NA NA NA NA 3 3 4 4 4 ...
##  $ number_trials_p_h       : num  3 3 3 1 1 6 6 NA NA NA ...
##  $ number_individuals_trial: chr  "Egg and sperm pools" "Egg and sperm pools" "Egg and sperm pools" "658 eggs (pooled from individuals)" ...
##  $ insemination_time       : num  240 240 240 120 120 2880 2880 240 360 1440 ...
##  $ sperm_per_m_l           : chr  NA NA NA NA ...
##  $ sperm_egg_ratio         : chr  "10" "10" "10" NA ...
##  $ number_females          : chr  "8" "8" "8" "6" ...
##  $ number_males            : chr  "11" "11" "11" "8" ...
##  $ other                   : chr  "Also 120 minute egg pre-incubation in experimental water" "Also 120 minute egg pre-incubation in experimental water" "Also 120 minute egg pre-incubation in experimental water" NA ...
##  $ x22                     : num  NA NA NA NA NA NA NA NA NA NA ...

Explore data with plotly figures

ggplotly(data.fert %>%
ggplot(mapping=aes(x=p_h_experimental, y=ave_fert_percent_p_h, group=taxonomic_group, col=taxonomic_group, text=`common_name`)) + 
  geom_point(size=1.5, width=0.02) +
  facet_wrap(~phylum, scale="free") +
  geom_smooth(method="lm", se=TRUE, aes(fill=taxonomic_group)) +
  ggtitle("Fertilization Rate ~ pH exposure by phylum, linear") +
  theme_minimal())
## Warning: Ignoring unknown parameters: width
## Warning: Removed 25 rows containing non-finite values (stat_smooth).
ggplotly(data.fert %>%
ggplot(mapping=aes(x=p_h_experimental, y=ave_fert_percent_p_h, group=taxonomic_group, col=taxonomic_group, text=`common_name`)) + 
  geom_point(size=1, width=0.02) +
  facet_wrap(~phylum, scale="free") +
  geom_smooth(method="lm", se=TRUE, formula=y ~ poly(x, 2, raw=TRUE), aes(fill=taxonomic_group)) +
  ggtitle("Fertilization Rate ~ pH exposure by phylum, polynomial") +
  theme_minimal())
## Warning: Ignoring unknown parameters: width

## Warning: Removed 25 rows containing non-finite values (stat_smooth).
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Convert all error values to separate SE & SD columns

95% Confidence Interval

Upper 95% CI = Mean + 1.96*SE; I recorded the difference between the upper 95%CI and the mean (Upper 95%CI - Mean). To convert I will use:
SE = (Upper 95%CI - Mean) / 1.96

SD = ((Upper 95%CI - Mean) / 1.96) * sqrt(n)

Standard Deviation

SE= SD/sqrt(n); where n=sample size. To convert I will use that equation (since I recorded sample size, i.e. number of trials at each pH)

SD = SE*sqrt(n)

Cases where I do not know the type of error statistic reported

I will use that statistic as-is (no conversion), thereby assuming it is SE.

data.fert <- data.fert %>% 
  mutate(SE =  case_when(error_statistic == "SD" ~ error_percent_p_h/sqrt(number_trials_p_h), 
         error_statistic == "95% CI" ~ error_percent_p_h/1.96,
         is.na(error_statistic) ~ error_percent_p_h, 
         error_statistic == "SE" ~ error_percent_p_h)) 

data.fert <- data.fert %>% 
  mutate(SD =  case_when(error_statistic == "SE" ~ error_percent_p_h*sqrt(number_trials_p_h), 
         error_statistic == "95% CI" ~ (error_percent_p_h/1.96)*sqrt(number_trials_p_h),
         is.na(error_statistic) ~ error_percent_p_h, 
         error_statistic == "SD" ~ error_percent_p_h)) 

data.fert %>% View()

Calculate pH experimental - pH control

data.fert <- data.fert %>% 
  mutate(pH_delta = p_h_control-p_h_experimental)

Convert % fertilization data to proportion fertilized, and replace any values >1 (aka 100% fertilized) with 1

data.fert <- data.fert %>% 
  mutate(ave_fert_proport = case_when(ave_fert_percent_p_h <= 100 ~ ave_fert_percent_p_h/100,
                                      ave_fert_percent_p_h > 100 ~ 1))

Transform proportion fertilized data to remove 1’s

Transormation equation source: https://cran.r-project.org/web/packages/betareg/vignettes/betareg.pdf, " … if y also assumes the extremes 0 and 1, a useful transformation in practice is (y · (n − 1) + 0.5)/n where n is the sample size (Smithson and Verkuilen 2006)."

HOWEVER

issue is that a few studies only had 1 trial per pH, so the transformation results in NA values

 # data.fert$ave_fert_proport.t <- data.fert$ave_fert_proport*((data.fert$number_trials_p_h-1) + 0.5) / data.fert$number_trials_p_h

Instead, I simply subtracted 0.001 from all data (but not sure if I will use that transformed data)

data.fert$ave_fert_proport.t <- data.fert$ave_fert_proport - 0.001

Inspect data

data.fert$ave_fert_proport.t %>% hist()

ggplot(data.fert, aes(group=phylum, col=phylum)) + geom_density(aes(ave_fert_proport.t))
## Warning: Removed 13 rows containing non-finite values (stat_density).

Calculate weights for models

NOTE: not working due to missing data.

weights <- metafor::escalc(measure='MN',
                mi=data.fert$ave_fert_percent_p_h,
                sdi = data.fert$SD,
                ni=data.fert$number_trials_p_h, options(na.action="na.pass"))  

Generate binomial models

test1 <- glmmTMB(ave_fert_proport ~ p_h_experimental + taxonomic_group/phylum + (1|study), data=data.fert, binomial(link = "logit"), na.action=na.exclude)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in fitTMB(TMBStruc): Model convergence problem; non-positive-definite
## Hessian matrix. See vignette('troubleshooting')
test2 <- glmmTMB(ave_fert_proport ~ p_h_experimental*phylum + (1|study), data=data.fert, binomial(link = "logit"), na.action=na.exclude)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
test3 <- glmmTMB(ave_fert_proport ~ p_h_experimental:phylum + (1|study), data=data.fert, binomial(link = "logit"), na.action=na.exclude)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
test4 <- glmmTMB(ave_fert_proport ~ p_h_experimental + phylum + (1|study), data=data.fert, binomial(link = "logit"), na.action=na.exclude)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
test5 <- glmmTMB(ave_fert_proport ~ p_h_experimental + (1|study), data=data.fert, binomial(link = "logit"), na.action=na.exclude)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
test6 <- glmmTMB(ave_fert_proport ~ (1|study), data=data.fert, binomial(link = "logit"), na.action=na.exclude)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
test7 <- glmmTMB(ave_fert_proport ~ phylum + (1|study), data=data.fert, binomial(link = "logit"), na.action=na.exclude)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
test8 <- glmmTMB(ave_fert_proport ~ p_h_experimental, data=data.fert, binomial(link = "logit"), na.action=na.exclude)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Determine best fit model

AIC(test1, test2, test3, test4, test5, test6, test7, test8) #test2 smallest AIC.  
## Warning in AIC.default(test1, test2, test3, test4, test5, test6, test7, : models
## are not all fitted to the same number of observations
##       df      AIC
## test1 42       NA
## test2  9 373.7304
## test3  6 369.9990
## test4  6 370.1405
## test5  3 368.4748
## test6  2 404.6719
## test7  5 406.9653
## test8  2 393.3454
#test differene between models test3 and test4. Stick with test5 model. 
anova(test5, test4) 
## Data: data.fert
## Models:
## test5: ave_fert_proport ~ p_h_experimental + (1 | study), zi=~0, disp=~1
## test4: ave_fert_proport ~ p_h_experimental + phylum + (1 | study), zi=~0, disp=~1
##       Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## test5  3 368.47 379.67 -181.24   362.47                         
## test4  6 370.14 392.54 -179.07   358.14 4.3343      3     0.2276
anova(test5, test3) 
## Data: data.fert
## Models:
## test5: ave_fert_proport ~ p_h_experimental + (1 | study), zi=~0, disp=~1
## test3: ave_fert_proport ~ p_h_experimental:phylum + (1 | study), zi=~0, disp=~1
##       Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## test5  3 368.47 379.67 -181.24   362.47                         
## test3  6 370.00 392.40 -179.00   358.00 4.4758      3     0.2145
anova(test5, test2) 
## Data: data.fert
## Models:
## test5: ave_fert_proport ~ p_h_experimental + (1 | study), zi=~0, disp=~1
## test2: ave_fert_proport ~ p_h_experimental * phylum + (1 | study), zi=~0, disp=~1
##       Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## test5  3 368.47 379.67 -181.24   362.47                         
## test2  9 373.73 407.33 -177.87   355.73 6.7444      6     0.3451
anova(test8, test5) 
## Data: data.fert
## Models:
## test8: ave_fert_proport ~ p_h_experimental, zi=~0, disp=~1
## test5: ave_fert_proport ~ p_h_experimental + (1 | study), zi=~0, disp=~1
##       Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## test8  2 393.35 400.81 -194.67   389.35                             
## test5  3 368.47 379.67 -181.24   362.47 26.871      1  2.175e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Examine model test5
car::Anova(test5) #phylum not quite significant factor, pH is a sign. factor. 
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: ave_fert_proport
##                   Chisq Df Pr(>Chisq)    
## p_h_experimental 17.098  1  3.549e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(test5)
##  Family: binomial  ( logit )
## Formula:          ave_fert_proport ~ p_h_experimental + (1 | study)
## Data: data.fert
## 
##      AIC      BIC   logLik deviance df.resid 
##    368.5    379.7   -181.2    362.5      306 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev.
##  study  (Intercept) 0.8953   0.9462  
## Number of obs: 309, groups:  study, 35
## 
## Conditional model:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -13.0110     3.3241  -3.914 9.07e-05 ***
## p_h_experimental   1.7928     0.4336   4.135 3.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Generate estimates & confidence intervals (log likelihood)
confint(test5)
##                                      2.5 %    97.5 %    Estimate
## cond.(Intercept)               -19.5261217 -6.495904 -13.0110131
## cond.p_h_experimental            0.9430309  2.642588   1.7928096
## study.cond.Std.Dev.(Intercept)   0.6096868  1.468417   0.9461896
# Instpect residuals ~ fitted values 
aa5 <- augment(test5, data=data.fert)
## Warning in mr - fitted(object): longer object length is not a multiple of
## shorter object length
ggplotly(ggplot(aa5, aes(x=.fitted,y=.resid)) + 
    geom_point() + 
  geom_smooth())
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (stat_smooth).

Generate model predictions and plot against real data

ph.min.max <- data.fert %>% 
  select(phylum, p_h_experimental) %>% 
  group_by(phylum) %>% 
  summarize(min=min(p_h_experimental, na.rm=TRUE), max=max(p_h_experimental, na.rm=TRUE))
  
phylum.list <- list()
for (i in 1:nrow(ph.min.max)) {
  phylum.list[[i]] <- data.frame(ph=c(seq(from=as.numeric(ph.min.max[i,"min"]), 
                            to=as.numeric(ph.min.max[i,"max"]), 
                            by=0.01)),
                   phylum=rep(c(ph.min.max[i,"phylum"])))
}
new.data <- bind_rows(phylum.list) %>% purrr::set_names(c("p_h_experimental", "phylum"))
new.data$study <- NA 

predict.test.df <- predict(test5, newdata = new.data, se.fit = TRUE, type="response")
predict.test.df.df <- predict.test.df %>%
  as.data.frame() %>%
  cbind(new.data)

#scales::show_col(c("#e41a1c","#4daf4a","#ff7f00","#984ea3",'#377eb8'))

# Data with beta regression model fit 
ggplotly(ggplot() + 
  geom_jitter(data=data.fert, aes(x=p_h_experimental, y=ave_fert_proport, group=phylum, col=phylum), size=1.2, width=0.03) +
  #facet_wrap(~phylum, scales="free") + theme_minimal() +
  ggtitle("% fertilization ~ pH with binomial-regression model predictions") + 
  xlab("Experimental pH") + ylab("Fertilization %") +
  scale_color_manual(name=NULL, values=c("#e41a1c","#ff7f00","#4daf4a",'#377eb8')) +
  theme_minimal() + 
  coord_cartesian(ylim = c(0, 1), xlim =c(6,8.5)) +
  geom_line(data = predict.test.df.df, aes(x=p_h_experimental, y=fit), col="gray50") + #, col=phylum
  geom_ribbon(data = predict.test.df.df, aes(x=p_h_experimental, ymin=fit-se.fit, ymax=fit+se.fit), linetype=2, alpha=0.1, col="gray50")) #, fill=phylum